library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
##
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:base':
##
## intersect
library(SeuratData)
## ── Installed datasets ──────────────────────────────── SeuratData v0.2.2.9001 ──
## ✔ ifnb 3.1.0 ✔ pbmc3k 3.1.4
## ✔ lungref 2.0.0
## ────────────────────────────────────── Key ─────────────────────────────────────
## ✔ Dataset loaded successfully
## ❯ Dataset built with a newer version of Seurat than installed
## ❓ Unknown version of Seurat installed
library(patchwork)
library(Azimuth)
## Registered S3 method overwritten by 'SeuratDisk':
## method from
## as.sparse.H5Group Seurat
##
## Attaching shinyBS
library(sctransform)
library(ggplot2)
library(hdf5r)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reticulate)
library(DESeq2) # if want to perform DE using this option
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following object is masked from 'package:SeuratObject':
##
## intersect
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:hdf5r':
##
## values
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:sp':
##
## %over%
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
##
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:Seurat':
##
## Assays
## The following object is masked from 'package:SeuratObject':
##
## Assays
##
## Attaching package: 'DESeq2'
## The following object is masked from 'package:sctransform':
##
## vst
library(reticulate)
library(BPCells)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ IRanges::collapse() masks dplyr::collapse()
## ✖ Biobase::combine() masks BiocGenerics::combine(), dplyr::combine()
## ✖ matrixStats::count() masks dplyr::count()
## ✖ IRanges::desc() masks dplyr::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ S4Vectors::first() masks dplyr::first()
## ✖ purrr::flatten_df() masks hdf5r::flatten_df()
## ✖ dplyr::lag() masks stats::lag()
## ✖ BiocGenerics::Position() masks ggplot2::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ S4Vectors::rename() masks dplyr::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ IRanges::slice() masks dplyr::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
## here() starts at /vf/users/pengl7/IRF/sc-20250318/seurat_HDAC11_mouse_bladder
use_condaenv("/data/pengl7/conda/envs/r4seurat")
py_config()
## python: /vf/users/pengl7/conda/envs/r4seurat/bin/python
## libpython: /data/pengl7/conda/envs/r4seurat/lib/libpython3.11.so
## pythonhome: /vf/users/pengl7/conda/envs/r4seurat:/vf/users/pengl7/conda/envs/r4seurat
## version: 3.11.5 | packaged by conda-forge | (main, Aug 27 2023, 03:34:09) [GCC 12.3.0]
## numpy: /vf/users/pengl7/conda/envs/r4seurat/lib/python3.11/site-packages/numpy
## numpy_version: 1.25.2
##
## NOTE: Python version was forced by use_python() function
# need to first create a conda env as r4seurat; conda active r4seurat; conda install leidenalg; conda install numpy pandas; start rstudio&
leidenalg <- import("leidenalg")
merged <- readRDS(here("results", "merged_filtered_percentile_filtered.rds"))
merged
## An object of class Seurat
## 23319 features across 30252 samples within 1 assay
## Active assay: RNA (23319 features, 0 variable features)
## 4 layers present: counts.1KO, counts.1WT, counts.2KO, counts.2WT
head(merged@meta.data)
## orig.ident nCount_RNA nFeature_RNA sample percent.mt
## 1KO_AAACCAAAGGCGATAG-1 1KO 578 377 1KO 0.1730104
## 1KO_AAACCAAAGTTGCTGA-1 1KO 2947 1669 1KO 6.4472345
## 1KO_AAACCCGCAAGCGCGT-1 1KO 1745 1071 1KO 0.6303725
## 1KO_AAACCCTGTTCGCACA-1 1KO 3828 2071 1KO 0.4963427
## 1KO_AAACCCTGTTGGACCC-1 1KO 6814 3087 1KO 0.1174053
## 1KO_AAACCGCTCAGCCATA-1 1KO 10135 3324 1KO 0.1282684
table(merged$orig.ident) # or check other metadata columns like merged$condition
##
## 1KO 1WT 2KO 2WT
## 7187 8334 7095 7636
merged$condition <- ifelse(grepl("WT", merged$orig.ident), "WT", "KO")
merged$replicate <- ifelse(grepl("1", merged$orig.ident), "1", "2")
head(merged$condition)
## 1KO_AAACCAAAGGCGATAG-1 1KO_AAACCAAAGTTGCTGA-1 1KO_AAACCCGCAAGCGCGT-1
## "KO" "KO" "KO"
## 1KO_AAACCCTGTTCGCACA-1 1KO_AAACCCTGTTGGACCC-1 1KO_AAACCGCTCAGCCATA-1
## "KO" "KO" "KO"
VlnPlot(merged, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), group.by = "orig.ident", pt.size = 0.1)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
nCount_RNA VS nFeature_RNA A strong correlation here is typical. Outliers (too many counts but few features) might be doublets or dead cells.
FeatureScatter(merged, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
merged <- NormalizeData(merged)
## Normalizing layer: counts.1KO
## Normalizing layer: counts.1WT
## Normalizing layer: counts.2KO
## Normalizing layer: counts.2WT
merged <- FindVariableFeatures(merged, selection.method = "vst", nfeatures = 2000)
## Finding variable features for layer counts.1KO
## Finding variable features for layer counts.1WT
## Finding variable features for layer counts.2KO
## Finding variable features for layer counts.2WT
merged <- ScaleData(merged, vars.to.regress = "percent.mt") # Optional regression
## Regressing out percent.mt
## Centering and scaling data matrix
merged <- RunPCA(merged)
## PC_ 1
## Positive: Sh3gl2, Ankfn1, Mecom, Trp63, Ctnnd2, Ttc6, Pkhd1, 9530036O11Rik, Gm20528, Gata3
## Dock8, 2610035D17Rik, Tmprss2, Rab27b, Foxq1, Fmo5, Mir205hg, Crybg1, Gm49890, Gpr39
## Paqr5, Gm13219, Ugt1a6a, Rbm47, Ikzf2, Fer1l4, AC106834.1, Syt16, Tiam1, D730045B01Rik
## Negative: Bnc2, Cacnb2, Slc8a1, Slit3, Arhgap6, Carmn, Nrp2, Plcl1, Cacna1c, Sgcd
## Adam33, Ror1, Zfpm2, Synpo2, Itga1, Kcnq5, Adgrd1, Sdk1, Magi2, Lsamp
## Fbxl7, Pgm5, Fam129a, Dmd, Pdlim3, Myom1, Filip1, Pdzrn3, Sorbs1, Col25a1
## PC_ 2
## Positive: Carmn, Col25a1, Myom1, Gm11946, Sntg2, Chrm3, Ryr3, Dgki, Hcn1, Kcnma1
## Synpo2, Rbfox3, Sorbs1, Pdlim3, Lrrc7, Myocd, Slc24a3, P2rx1, Dmd, Fhod3
## Cacna1c, Dtna, Art3, Itga1, Stac, Cap2, Akap6, Cacna2d3, Kcnn2, Dock3
## Negative: Zfp521, Pid1, Sox5, Col14a1, Xdh, Abi3bp, Lama2, Kcnt2, Cdk14, Plxdc2
## Gas7, Ebf1, Bicc1, Cfh, Cped1, Pdgfra, Cd55, Grk5, Fndc1, Man1a
## Gxylt2, Cgnl1, Apbb1ip, Fbln1, Dab2, Cntn4, Dcn, Cd34, Il1rapl1, Fbn1
## PC_ 3
## Positive: Gpc6, Bicc1, Sox5, Adamtsl1, Cntn4, Col14a1, Dlc1, Lama2, Zfpm2, Tenm3
## Meg3, Cd55, Abi3bp, Adamtsl3, Ghr, Pdgfra, Col3a1, Col5a2, Pdzrn3, Fbln1
## Gxylt2, Sdk1, Kcnt2, Plxdc2, Magi2, Cdh11, Bnc2, Fndc1, Il1rapl1, Adamts2
## Negative: Dock2, Ptprc, Inpp5d, Arhgap15, Gm26740, Myo1f, March1, Ikzf1, Nrros, Ly86
## Cyth4, Epsti1, Mrc1, Tbxas1, Adgre1, F13a1, Arhgap45, F630028O10Rik, Cd84, Fli1
## St8sia4, Aoah, Mir142hg, Mctp1, Slc9a9, Arhgap30, Pirb, Pik3ap1, Tm6sf1, Lilrb4a
## PC_ 4
## Positive: Adgrf5, Egfl7, Cyyr1, Ptprb, Pecam1, Emcn, Adgrl4, Erg, Ldb2, Rasgrf2
## Ushbp1, Dipk2b, Shank3, Flt1, Cdh5, St6galnac3, Kank3, Sncaip, Sema6a, Abcb1a
## Tll1, Vwf, Kdr, Palmd, Ablim3, Prkch, Podxl, Abcg2, Sox17, Tek
## Negative: Pid1, Zeb2, Apbb1ip, Arhgap15, Dock2, Ptprc, Slc9a9, March1, Gm26740, Myo1f
## Dock10, Arhgap45, Gpc6, Ly86, Adgre1, F13a1, Tbxas1, F630028O10Rik, Mrc1, Cntn4
## Rnf150, Cyth4, Epsti1, Abca9, Lama2, Ikzf1, Aoah, Ifi207, Cd84, Mitf
## PC_ 5
## Positive: Ebf1, Prex2, Cped1, Fgd5, Lama2, Ptprb, Tek, Pdgfra, Jam2, Cntn4
## Pecam1, Cyyr1, Egfl7, Flt1, Hspa12b, Lhfpl2, Rnf144a, Dnm3, Emcn, Gsn
## Pi16, Calcrl, Adgrl4, Erg, Myrip, Cxcl12, Adam12, Ldb2, Scara5, Ntn1
## Negative: Muc16, Pcnx2, Wt1, Rspo1, Wt1os, Kcnd2, Gpm6a, Wdr17, Bnc1, Pkhd1l1
## Dpp4, Ptprq, Asic2, Pcdh15, Fgf1, Ddo, Msln, Cybrd1, Ptger3, Sntg1
## Slc39a8, Smpd3, Atp6v0a4, Nxph1, Prss12, Lrrn4, Il16, Col4a4, Dnajc6, Cldn15
ElbowPlot(merged, ndims = 50)
merged <- RunUMAP(merged, dims = 1:20)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 14:02:14 UMAP embedding parameters a = 0.9922 b = 1.112
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 14:02:14 Read 30252 rows and found 20 numeric columns
## 14:02:14 Using Annoy for neighbor search, n_neighbors = 30
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 14:02:14 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:02:16 Writing NN index file to temp file /lscratch/52602149/RtmpRKXtoc/file1ba6c12daf3cb0
## 14:02:16 Searching Annoy index using 1 thread, search_k = 3000
## 14:02:24 Annoy recall = 100%
## 14:02:25 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 14:02:27 Initializing from normalized Laplacian + noise (using RSpectra)
## 14:02:28 Commencing optimization for 200 epochs, with 1351298 positive edges
## 14:02:38 Optimization finished
DimPlot(merged, label = TRUE)
merged <- RunUMAP(merged, dims = 1:30)
## 14:02:39 UMAP embedding parameters a = 0.9922 b = 1.112
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 14:02:39 Read 30252 rows and found 30 numeric columns
## 14:02:39 Using Annoy for neighbor search, n_neighbors = 30
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 14:02:39 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:02:41 Writing NN index file to temp file /lscratch/52602149/RtmpRKXtoc/file1ba6c139b9fecf
## 14:02:41 Searching Annoy index using 1 thread, search_k = 3000
## 14:02:48 Annoy recall = 100%
## 14:02:49 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 14:02:51 Initializing from normalized Laplacian + noise (using RSpectra)
## 14:02:53 Commencing optimization for 200 epochs, with 1364094 positive edges
## 14:03:03 Optimization finished
DimPlot(merged, label = TRUE)
# chose PCs=30 to perform clustering
merged <- RunUMAP(merged, dims = 1:30)
## 14:03:04 UMAP embedding parameters a = 0.9922 b = 1.112
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 14:03:04 Read 30252 rows and found 30 numeric columns
## 14:03:04 Using Annoy for neighbor search, n_neighbors = 30
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 14:03:04 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:03:06 Writing NN index file to temp file /lscratch/52602149/RtmpRKXtoc/file1ba6c175f67d64
## 14:03:06 Searching Annoy index using 1 thread, search_k = 3000
## 14:03:13 Annoy recall = 100%
## 14:03:14 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 14:03:17 Initializing from normalized Laplacian + noise (using RSpectra)
## 14:03:18 Commencing optimization for 200 epochs, with 1364094 positive edges
## 14:03:28 Optimization finished
merged <- FindNeighbors(merged, dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
merged <- FindClusters(merged, resolution = 0.4)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 30252
## Number of edges: 1164289
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9400
## Number of communities: 20
## Elapsed time: 5 seconds
Visualize clusters:
DimPlot(merged, group.by = "seurat_clusters", label = TRUE)
merged <- FindClusters(merged, resolution = 0.3)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 30252
## Number of edges: 1164289
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9480
## Number of communities: 18
## Elapsed time: 4 seconds
DimPlot(merged, group.by = "seurat_clusters", label = TRUE)
merged <- FindClusters(merged, resolution = 0.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 30252
## Number of edges: 1164289
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9578
## Number of communities: 15
## Elapsed time: 5 seconds
DimPlot(merged, group.by = "seurat_clusters", label = TRUE)
#install.packages("clustree")
library(clustree)
## Loading required package: ggraph
##
## Attaching package: 'ggraph'
## The following object is masked from 'package:sp':
##
## geometry
clustree(merged, prefix = "RNA_snn_res.")
Need to join layers before performing FindAllMarkers since I merged 4 samples together which keep their indiviaul data and count slots
merged <- JoinLayers(
object = merged,
assay = "RNA",
slot = "data", # or "counts" if you're starting from raw counts
new.layer = "joined_normalized" # name for the new combined layer
)
Before JoinLayer 4 layers present: counts.1KO, counts.1WT, counts.2KO, counts.2WT After JoinLayer: 3 layers present: data, counts, scale.data
merged
## An object of class Seurat
## 23319 features across 30252 samples within 1 assay
## Active assay: RNA (23319 features, 2000 variable features)
## 3 layers present: data, counts, scale.data
## 2 dimensional reductions calculated: pca, umap
Use marker genes to check whether these clusters represent distinct biological identities:
Idents(merged) <- "seurat_clusters"
markers <- FindAllMarkers(
merged,
only.pos = TRUE,
min.pct = 0.1,
logfc.threshold = 0.1
)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
top10 <- markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
DoHeatmap(merged, features = top10$gene) + NoLegend()
## Warning in DoHeatmap(merged, features = top10$gene): The following features
## were omitted as they were not found in the scale.data slot for the RNA assay:
## BC035947, A530053G22Rik, Sucnr1, Gm454, Gm13403, Gm12688, Gm10475, Duox2,
## Arid3c, Ugt2b1, Gm26777, Skint7, Gm32772, Slc9a4, Mroh3, Zfp872, Elfn1, Dsp,
## Uchl1, Krt8, Krt5, Gm36535, Gm24474, Gm34639, Baiap2l2, 5830418P13Rik, Gm44737,
## Vwa5b2, Gm12295, Zp3r, Antxrl, B230312C02Rik, 4933413L06Rik, Ckm, Hspb6, Flnc,
## Myl9, Synm
visualize specific genes
FeaturePlot(merged, features = c("Upk3a", "Acta2", "Col1a1", "Cd3e", "Cd79a"))
DotPlot(merged, features = c("Upk3a", "Acta2", "Col1a1", "Cd3e", "Cd79a")) + RotatedAxis()
DotPlot(merged, features = list(
Urothelial = c("Upk3a", "Krt20", "Krt8"),
SmoothMuscle = c("Acta2", "Tagln", "Myh11"),
Fibroblast = c("Col1a1", "Dcn", "Pdgfra"),
Tcell = c("Cd3e", "Cd3d", "Trac"),
Bcell = c("Cd79a", "Ms4a1"),
Macrophage = c("Lyz2", "Adgre1")
)) + RotatedAxis()
## Warning: The `facets` argument of `facet_grid()` is deprecated as of ggplot2 2.2.0.
## ℹ Please use the `rows` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
VlnPlot(merged, features = c("Acta2", "Col1a1", "Cd3e"), group.by = "seurat_clusters", pt.size = 0.1)
Plot UMAP by batch/sample/condition
# Color by sample identity
DimPlot(merged, group.by = "orig.ident", label = TRUE, repel = TRUE)
# Optional: split UMAP by sample or condition
DimPlot(merged, split.by = "orig.ident", group.by = "seurat_clusters")
DimPlot(merged, split.by = "condition", group.by = "seurat_clusters")
This helps quantify how balanced each cluster is across samples:
# Table of cluster × sample counts
tbl <- table(merged$seurat_clusters, merged$orig.ident)
# Reorder samples: WT1, WT2, KO1, KO2
sample_order <- c("1WT", "2WT", "1KO", "2KO") # adjust based on your actual sample names
# Reorder the table columns
tbl <- tbl[, sample_order]
# Colors for clusters — use a consistent vector across plot & legend
cluster_colors <- 1:nrow(tbl) # or you can define manually, like c("black", "red", ...)
# Proportions per sample
tbl_prop <- prop.table(tbl, margin = 2)
# Plot: That means 4 colors (recycled for each group of bars)
par(mar = c(5, 4, 4, 8), xpd = TRUE)
barplot(tbl_prop, beside = TRUE, col = 1:4,
main = "Sample composition per cluster",
ylab = "Proportion")
# Legend
legend("topright", inset = c(-0.2, 0),
legend = rownames(tbl),
fill = cluster_colors,
title = "Cluster",
cex = 0.8)
# Normalize proportions per cluster
tbl_prop <- prop.table(tbl, margin = 2)
# Set layout so there's space on the right for the legend
par(mar = c(5, 4, 4, 8), xpd = TRUE)
# Barplot with room for the legend
barplot(tbl_prop,
beside = TRUE,
col = rainbow(nrow(tbl)),
main = "Sample composition per cluster",
ylab = "Proportion",
las = 2)
# Add legend to the right
legend("topright", inset = c(-0.2, 0), legend = rownames(tbl),
fill = rainbow(nrow(tbl)), title = "Cluster", cex = 0.8)
as.data.frame(tbl) %>%
ggplot(aes(x = Var1, y = Freq, fill = Var2)) +
geom_bar(stat = "identity", position = "fill") +
labs(x = "Cluster", y = "Proportion", fill = "Sample") +
theme_minimal()
saveRDS(merged, file = here("results", "layerJoined_clusted.rds"))